Analysis of First Targeted Sequencing Run on Sequel 1:
1. On-Target Rate
2. Raw Coverage
+ Number of Transcripts per Gene identified
3. Filtering by ToFu
4. Number of Isoforms per Gene identified
# Input IsoSeq Paths
CCS_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/CCS"
LiMA_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/LIMA"
REFINE_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/REFINE"
CLUSTER_input_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/IsoSeq/CLUSTER"
## Input CCS, LIMA, REFINE files
# CCS, LIMA summary
CCS <- read.csv(paste0(CCS_input_dir, "/Batched_CCS_output.csv"), header = T)
LIMA <- read.csv(paste0(LiMA_input_dir, "/Batched_LIMA_summary.csv"), header = T)
# REFINE summary input
REFINE_json_list <- list.files(paste0(REFINE_input_dir), pattern = "flnc.filter_summary.json", full.names = T) %>% .[!grepl("Targeted_Seq",.)]
REFINE_list <- lapply(REFINE_json_list , function(x) as.data.frame(fromJSON(file = x)))
names(REFINE_list) <- list.files(paste0(REFINE_input_dir), pattern = "flnc.filter_summary.json") %>% .[!grepl("Targeted_Seq",.)]
for(i in 1:length(names(REFINE_list))){REFINE_list[[i]]$Sample <- names(REFINE_list)[[i]]}
# CLUSTER files
# do not include batched report Targeted_Seq_1.clustered.cluster_report.csv
CLUSTER_input_files <- list.files(paste0(CLUSTER_input_dir), pattern = "clustered.cluster_report.csv", full.names = T) %>% .[!grepl("Targeted_Seq",.)]
CLUSTER_list <- lapply(CLUSTER_input_files, function(x) read.csv(x))
names(CLUSTER_list) <- list.files(paste0(CLUSTER_input_dir), pattern = "clustered.cluster_report.csv") %>% .[!grepl("Targeted_Seq",.)]
for(i in 1:length(names(CLUSTER_list))){CLUSTER_list[[i]]$Sample <- names(CLUSTER_list)[[i]]}
source("/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/Scripts/IsoSeq_Tg4510/1_Transcriptome_Annotation/Isoseq3_QC/RunStats_Function.R")
Reads <- number_of_reads()
Merged_CLUSTER <- read.csv("/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/All_Targeted_Merged/CLUSTER/All_Targeted_Merged.clustered.cluster_report.csv")
Reads <- Reads %>% mutate(Batch = word(Reads$variable, c(3), sep = fixed("_"))) %>%
full_join(., targetedpheno, by = c("sample" = "Sample")) %>%
unite("Batch", Batch.x,Batch.y, na.rm = TRUE) %>%
mutate(Batch = recode(Batch, "3b" = "3", "3a" = "3 (partial run)")) %>%
select(Description,value,sample,Batch) %>%
bind_rows(data.frame(Description = "Clustered reads(Transcripts)",value = as.numeric(as.character(nrow(Merged_CLUSTER))),
sample = "Merged", Batch = "Merged"))
pIsoSeq <- ggplot(Reads, aes(x = Description, y = value, colour = Batch, group = Batch)) +
geom_line() + geom_point() + mytheme + theme(legend.position = "bottom") + labs(x = "", y = "Number of Reads (Thousands)") +
scale_y_continuous(labels = unit_format(unit = "", scale = 1e-3)) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
pIsoSeqUsing L.Tseng’s script (https://github.com/Magdoll/cDNA_Cupcake/wiki/Targeted-Iso-Seq-QC-Wiki) to calculate the on-target rate from polished.fa file, SAM alignment, and probe BED file.
Batch 2 and Batch 3 had more samples (n = 9) than batch 6, therefore increased propensity to sequence targeted genes
Note: This is prior to Tofu-collapse, therefore some of different FL transcripts can still be derived from the same isoform
Mapping_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/Post_IsoSeq/MAPPING"
Probes_input <- list.files(path = Mapping_dir, pattern = "fasta.sam.probe_hit.txt", full.names = T)
Probes_files <- lapply(Probes_input, function(x) read.table(x, header=T, as.is=T, sep="\t"))
names(Probes_files ) <- list.files(path = Mapping_dir, pattern = "fasta.sam.probe_hit.txt")
Probes <- merge(ldply(Probes_files , function(x) nrow(x)),
ldply(Probes_files , function(x) length(which(x$num_base_overlap != "0"))),by = ".id") %>%
`colnames<-`(c("file", "Total_mapped_reads", "reads_probe_hit")) %>%
mutate(perc = reads_probe_hit/Total_mapped_reads * 100) %>%
mutate(sample = word(.$file, c(1), sep = fixed("."))) %>%
full_join(., targetedpheno, by = c("sample" = "Sample"))
pProbes<- ggplot(Probes, aes(x = as.factor(Batch), y = perc, fill = as.factor(Phenotype))) + geom_boxplot() +
geom_point(aes(colour = as.factor(Phenotype)), position = position_jitterdodge()) +
mytheme + labs(y = "On-Target Rate", x = "Batch") +
scale_fill_manual(values = c(alpha(label_colour("TG"),0.4),alpha(label_colour("WT"),0.4)), name = "Phenotype") +
scale_colour_manual(values = c(label_colour("TG"),label_colour("WT")), guide=FALSE)
pProbesMerged_Mapping_dir <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/All_Targeted_Merged/MAPPING/"
Merged_probes_files <- read.table(paste0(Merged_Mapping_dir,"All_Targeted_Merged.fasta.sam.probe_hit.txt"),header=T, as.is=T, sep="\t")
#sum(Reads[Reads$Description == "Clustered reads(Transcripts)","value"])
#nrow(Merged_CLUSTER)A total of 50634 FL transcripts identified. Only an on-target rate of 26%, i.e. % of FL transcripts with at least one of its bases matching to any primer.
merged data clustering, cupcake collapse, cupcake filtering, sqanti filtering filter to only relevant genes, and number of isoforms
All_Targeted_Merged <- "/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/All_Targeted_Merged"
Merged_PostIso <- list(read.csv(paste0(All_Targeted_Merged,"/CLUSTER/All_Targeted_Merged.clustered.cluster_report.csv")), # Cluster
read.table(paste0(All_Targeted_Merged,"/TOFU/All_Targeted_Merged.collapsed.abundance.txt"), header = T), # Cupcake collapse
read.table(paste0(All_Targeted_Merged,"/TOFU/All_Targeted_Merged.collapsed.read_stat.txt"), header = T),
read.table(paste0(All_Targeted_Merged,"/TOFU/All_Targeted_Merged.collapsed.filtered.abundance.txt"), as.is = T, sep = "\t", header = T),
read.table(paste0(All_Targeted_Merged,"/SQANTI/All_Targeted_Merged.collapsed.filtered_classification.filtered_lite_classification.txt"),
as.is = T, sep = "\t", header = T)
)
names(Merged_PostIso) <- c("cluster","cupcake","cupcake_readstat","cupcake_filter","sqanti_filter")
colnames(Merged_probes_files)[6] <- "Probes"
Merged_probes_files$Gene <- word(Merged_probes_files$Probes,c(3), sep = fixed ('_'))
Merged_probes_files$Gene <- word(Merged_probes_files$Gene,c(1), sep = fixed ('('))
Merged_probes_files$Gene[Merged_probes_files$Gene == "27642461"] <- "MAPT"
Merged_probes_files$Gene[Merged_probes_files$Gene == "27642460"] <- "ANK1"
# Filter by targeted genes
# targeted_transcripts = transcript/x of targeted genes (differented by having overlap with probes)
# targeted_read_id = mx/x/ccs of targeted genes (origianal reads)
# targeted_pbid = pbid of targeted genes (from readstat file after filtering for targeted_read_id)
targeted <- Merged_probes_files %>% filter(Merged_probes_files$num_base_overlap != "0") %>% select(read_id,Gene) %>% rename(transcript_id = read_id) %>%
left_join(.,Merged_PostIso$cluster, by = c("transcript_id"= "cluster_id")) %>%
left_join(.,Merged_PostIso$cupcake_readstat, by = c("read_id"="id") )
# Example of unmapped transcript but probed
#Merged_probes_files %>% filter(read_id == "transcript/24")
#Merged_PostIso$cluster %>% filter(cluster_id == "transcript/24")
#Merged_PostIso$cupcake_readstat %>% filter(id == "m54082_200731_163617/50594276/ccs")
tgcluster <- targeted %>% group_by(transcript_id,Gene) %>% tally() %>% # multiple same transcript_id from clustering ccs reads
group_by(Gene) %>% tally() %>% mutate(type = "Clustered")
# PBids = NA for non-targeted genes
tgcupcake <- Merged_PostIso$cupcake %>% left_join(., unique(targeted[,c("pbid","Gene")])) %>% group_by(Gene) %>% tally() %>% filter(!is.na(Gene)) %>% mutate(type = "Cupcake")
tgcupcakefilter <- Merged_PostIso$cupcake_filter %>% left_join(., unique(targeted[,c("pbid","Gene")], by = "pbid")) %>% group_by(Gene) %>% tally() %>% filter(!is.na(Gene)) %>% mutate(type = "Cupcake Filter")
# if use probes, also misannotation of other genes nearby "Gm49227"
#tgsqfilter <- Merged_PostIso$sqanti_filter %>% left_join(., unique(targeted[,c("pbid","Gene")]), by = c("isoform" = "pbid")) %>% group_by(Gene) %>% tally() %>% filter(!is.na(Gene))
tgsqfilter <- Merged_PostIso$sqanti_filter %>% filter(toupper(associated_gene) %in% targeted$Gene) %>% group_by(associated_gene) %>% tally() %>% mutate(type = "Sqanti filter") %>% mutate(Gene = toupper(associated_gene))
pNumfil <- bind_rows(tgcluster, tgcupcake, tgcupcakefilter, tgsqfilter) %>%
ggplot(., aes(x = Gene, y = n, fill = type)) + geom_bar(stat = "identity") +
mytheme + labs(x = "", y = "Number of Transcripts (Thousand)") + theme(legend.position = c(0.8,0.8)) +
scale_fill_discrete(name = "", labels = c("Iso-Seq Cluster","Cupcake Collapse","Cupcake Filter","SQANTI filter")) +
scale_y_continuous(labels = ks) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
pNumfilplot_sq_fil <- function(sq_df){
p <- sq_df %>% filter(toupper(associated_gene) %in% targeted$Gene) %>%
select(isoform, structural_category, associated_gene, associated_transcript, subcategory) %>%
mutate(transcript_type = ifelse(.$associated_transcript == "novel","Novel","Annotated")) %>%
mutate(transcript_type = factor(transcript_type, levels = c("Novel","Annotated"))) %>%
ggplot(., aes(x = associated_gene, fill = transcript_type)) + geom_bar() +
mytheme + labs(x = "", y = "Number of Isoforms") +
scale_fill_discrete(name = "Transcript Classification") +
theme(legend.position = c(0.9,0.9)) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
return(p)
}
pNum1 <- plot_sq_fil(Merged_PostIso$sqanti_filter)
pNum1pNum2 <- Merged_PostIso$sqanti_filter %>% filter(toupper(associated_gene) %in% targeted$Gene) %>%
mutate(structural_category = factor(structural_category, levels = c("novel_not_in_catalog","novel_in_catalog",
"incomplete-splice_match","full-splice_match"))) %>%
ggplot(., aes(x = associated_gene, fill = structural_category)) + geom_bar() +
mytheme + labs(x = "", y = "Number of Isoforms") +
theme(legend.position = c(0.9,0.9)) +
scale_fill_manual(name = "Transcript Classification", labels = c("NNC","NIC","ISM","FSM"),
values = c(alpha("#F8766D",0.8),alpha("#F8766D",0.3),alpha("#00BFC4",0.8),alpha("#00BFC4",0.3))) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
pNum2plot_subcategory <- function(structural_category_input){
p <- Merged_PostIso$sqanti_filter %>% filter(toupper(associated_gene) %in% targeted$Gene) %>%
select(associated_gene,structural_category,subcategory) %>%
filter(structural_category == structural_category_input) %>%
ggplot(., aes(x = associated_gene, fill = subcategory)) + geom_bar() + mytheme +
labs(y = "Number of Transcripts", x = "") +
theme(legend.position = c(0.9,0.9),
axis.text.x = element_text(angle = 45, hjust=1))
return(p)
}
plot_subcategory("incomplete-splice_match") B1sq <- read.table("/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/Post_IsoSeq/SQANTI/Targeted_Seq_1/Targeted_Seq_1.collapsed.filtered_classification.filtered_lite_classification.txt", header = T)
B1sq %>% filter(toupper(associated_gene) %in% targeted$Gene) %>%
mutate(transcript_type = ifelse(.$associated_transcript == "novel","Novel","Annotated")) %>%
mutate(transcript_type = factor(transcript_type, levels = c("Novel","Annotated"))) %>%
ggplot(., aes(x = associated_gene, fill = transcript_type)) + geom_bar() +
mytheme + labs(x = "", y = "Number of Isoforms") +
scale_fill_discrete(name = "Transcript Classification") +
theme(legend.position = c(0.9,0.9)) +
theme(axis.text.x = element_text(angle = 45, hjust=1))## isoform chrom strand length exons structural_category associated_gene
## 1 PB.1357.2 chr7 - 2140 5 novel_in_catalog Cd33
## 2 PB.1357.3 chr7 - 1510 6 full-splice_match Cd33
## associated_transcript ref_length ref_exons diff_to_TSS diff_to_TTS
## 1 novel 1962 7 NA NA
## 2 ENSMUST00000205503.1 5728 6 60 4158
## diff_to_gene_TSS diff_to_gene_TTS subcategory RTS_stage all_canonical
## 1 -10 2 intron_retention FALSE canonical
## 2 -10 0 multi-exon FALSE canonical
## min_sample_cov min_cov min_cov_pos sd_cov FL n_indels n_indels_junc bite
## 1 NA NA NA NA 2 NA NA TRUE
## 2 NA NA NA NA 6 NA NA TRUE
## iso_exp gene_exp ratio_exp FSM_class coding ORF_length CDS_length CDS_start
## 1 NA NA NA C coding 149 450 326
## 2 NA NA NA C coding 334 1005 18
## CDS_end CDS_genomic_start CDS_genomic_end predicted_NMD perc_A_downstream_TTS
## 1 775 43532197 43529737 FALSE 15
## 2 1022 43533094 43528862 FALSE 15
## seq_A_downstream_TTS dist_to_cage_peak within_cage_peak dist_to_polya_site
## 1 GAGGTGTGGCCTTGTTGAAT -10 True NA
## 2 AGGAGGTGTGGCCTTGTTGA -10 True NA
## within_polya_site polyA_motif polyA_dist
## 1 NA <NA> NA
## 2 NA <NA> NA
## isoform chrom strand length exons structural_category associated_gene
## 1 PB.1357.3 chr7 - 1510 6 full-splice_match Cd33
## associated_transcript ref_length ref_exons diff_to_TSS diff_to_TTS
## 1 ENSMUST00000205503.1 5728 6 60 4158
## diff_to_gene_TSS diff_to_gene_TTS subcategory RTS_stage all_canonical
## 1 -10 0 multi-exon FALSE canonical
## min_sample_cov min_cov min_cov_pos sd_cov FL n_indels n_indels_junc bite
## 1 NA NA NA NA 6 NA NA TRUE
## iso_exp gene_exp ratio_exp FSM_class coding ORF_length CDS_length CDS_start
## 1 NA NA NA C coding 334 1005 18
## CDS_end CDS_genomic_start CDS_genomic_end predicted_NMD perc_A_downstream_TTS
## 1 1022 43533094 43528862 FALSE 15
## seq_A_downstream_TTS dist_to_cage_peak within_cage_peak dist_to_polya_site
## 1 AGGAGGTGTGGCCTTGTTGA -10 True NA
## within_polya_site polyA_motif polyA_dist
## 1 NA <NA> NA
B1readstat <- read.table("/gpfs/mrc0/projects/Research_Project-MRC148213/sl693/IsoSeq/Targeted_Transcriptome/Post_IsoSeq/TOFU/Targeted_Seq_1.collapsed.read_stat.txt", header = T)
B1readstat %>% filter(pbid == "PB.1357.3")## id length is_fl stat pbid
## 1 m54082_191116_131337/47644765/ccs NA Y unique PB.1357.3
## 2 m54082_191116_131337/57737675/ccs NA Y unique PB.1357.3
## 3 m54082_191116_131337/44171880/ccs NA Y unique PB.1357.3
## 4 m54082_191116_131337/25101285/ccs NA Y unique PB.1357.3
## 5 m54082_191116_131337/31195690/ccs NA Y unique PB.1357.3
## 6 m54082_191116_131337/40043410/ccs NA Y unique PB.1357.3
## id length is_fl stat pbid
## 1 m54082_191116_131337/47644765/ccs NA Y unique PB.7402.33
## id length is_fl stat pbid
## 1 m54082_191116_131337/57737675/ccs NA Y unique PB.7402.33
## [1] pbid count_fl count_nfl count_nfl_amb norm_fl
## [6] norm_nfl norm_nfl_amb
## <0 rows> (or 0-length row.names)
## [1] isoform chrom strand
## [4] length exons structural_category
## [7] associated_gene associated_transcript ref_length
## [10] ref_exons diff_to_TSS diff_to_TTS
## [13] diff_to_gene_TSS diff_to_gene_TTS subcategory
## [16] RTS_stage all_canonical min_sample_cov
## [19] min_cov min_cov_pos sd_cov
## [22] FL n_indels n_indels_junc
## [25] bite iso_exp gene_exp
## [28] ratio_exp FSM_class coding
## [31] ORF_length CDS_length CDS_start
## [34] CDS_end CDS_genomic_start CDS_genomic_end
## [37] predicted_NMD perc_A_downstream_TTS seq_A_downstream_TTS
## [40] dist_to_cage_peak within_cage_peak dist_to_polya_site
## [43] within_polya_site polyA_motif polyA_dist
## [46] FL.K17 FL.K18 FL.K19
## [49] FL.K20 FL.K21 FL.K23
## [52] FL.K24 FL.L18 FL.L22
## [55] FL.M21 FL.O18 FL.O22
## [58] FL.O23 FL.P19 FL.Q17
## [61] FL.Q18 FL.Q20 FL.Q21
## [64] FL.Q23 FL.S18 FL.S19
## [67] FL.S23 FL.T18 FL.T20
## <0 rows> (or 0-length row.names)
## [1] pbid count_fl count_nfl count_nfl_amb norm_fl
## [6] norm_nfl norm_nfl_amb
## <0 rows> (or 0-length row.names)
Pipeline for analysis
# after TAMA filtering and subset of SQANTI filtered dataset, generated with same pipeline (but added TAMA filtering at the end) and using 99% coverage threshold for cupcake
Merged_PostIso_alt <- list(
read.table(paste0(All_Targeted_Merged,"/Alternative_Pipeline/SQANTI/All_Targeted_Merged.collapsed_classification.filtered_lite_classification.txt"), header = T),
read.table(paste0(All_Targeted_Merged,"/Alternative_Pipeline/SQANTI_TAMA_FILTER/All_Targeted_Merged_sqantitamafiltered.classification.txt"), header = T)
)
names(Merged_PostIso_alt) <- c("sqanti_filter", "sqanti_tama_filter")